from IPython.display import display, Image

# Raw string keeps the Windows-path backslashes literal (a non-raw path
# relies on Python not recognising sequences like "\D", which raises a
# SyntaxWarning on modern Python); display() renders the image explicitly.
display(Image(r"D:\Data_science\moviereview.png"))
Online reviews are important because they have become a reference point for buyers across the globe and because so many people trust them when making purchase decisions.
Reviews are also important for Search Engine Optimization (SEO). Having positive reviews is also another way through which you can improve a website’s Search Engine visibility. The more that people talk about a brand online, the greater its visibility to Search Engines, such as Google, Yahoo and Bing.
For the audience and booking websites, analysing reviews is significant in understanding reviewer opinion about the film.
In movie booking websites, 90% of people first check out online reviews before purchasing tickets.
For the production house, analysing negative reviews can be useful for damage control.
from IPython.display import display, Image

# One import instead of ten duplicates; raw strings keep the Windows-path
# backslashes literal, and display() renders each image explicitly even
# when the call is not a cell's last expression.
display(Image(r"D:\Data_science\PHD\client.png"))
display(Image(r"D:\Data_science\PHD\waltdisney.png"))
display(Image(r"D:\Data_science\PHD\waltdisneymovies.png"))
display(Image(r"D:\Data_science\PHD\disneyfamily.png"))
display(Image(r"D:\Data_science\PHD\companies.png"))
display(Image(r"D:\Data_science\PHD\disneyand20thcen.png"))
display(Image(r"D:\Data_science\PHD\disneystorewebsite.png"))
display(Image(r"D:\Data_science\PHD\disneyproducts.png"))
display(Image(r"D:\Data_science\PHD\someaudiencereviews.png"))
display(Image(r"D:\Data_science\PHD\someaudiencereviews2.png"))
Disney is well known for remakes, sequels and animated movies.
Very successful in Animation movies
Not so successful in Live-Action movies
Usually Don’t release movie in Dec.
June-July is Jackpot month. They need to decide right combination of movies to release in June-July to meet the expectation and demand
They postpone the release date of few movies depends on the sentiment mix among audience.
Dumbo - Remake
Aladdin – Remake
Toy story 4- Sequel
EndGame - Avengers
The Lion King - Remake
Artemis Fowl – New – Live action
Mistress of Evil – Remake – Oct month
Frozen II – Sequel – Nov month
Feb – Untitled Live action - New
Mar – Onward – Remake
Mar – Mulan – Live action – remake of animated version.
Jun – Monsters – Remake
July – Jungle Cruise – Remake(Initially planned in Oct 2019).
Oct – Untitled Live action - Sequel
from IPython.display import display, Image

# Raw string keeps the Windows-path backslashes literal.
display(Image(r"D:\Data_science\PHD\lionkinghistory.png"))
Walt Disney started making CGI remakes of its older animated movies and wanted to know how customers have responded to these trials so far.
Overall sentiment of Audience about movie.
Frequently commented words to merchandise those words and themes.
When they can plan Sequel of Lion King.
How will the overall sentiment be
What they have to target
Any technical comments – Background music, song, voice.
Any sentiment comments – Violence, fights, pride, etc.
Sentiment on CGI based movies
Right mix of movies to release.
Should they focus on animation or Live action.
Do they need to reschedule any movie release in upcoming list.
Upcoming movies – how they should advertise.
from IPython.display import display, Image

# Raw strings keep the Windows-path backslashes literal; display() is
# required here anyway since only a cell's last bare expression renders.
display(Image(r"D:\Data_science\PHD\TVlicensereview.png"))
display(Image(r"D:\Data_science\PHD\licensefee.png"))
Producer company makes decent above 20% of total revenue in TV Telecast license for the movie.
They have to decide when they have to stop movies in theatres and issue TV license.
This is important decision to make.
Cleaning of Text:
Stop words, stemming, Lemmatize a. Using spacy b. Add customized stop words c. Use NLTK’s stemmer d. Tokenize e. Custom Stop words Removal f. Stemming
Remove stop words from clean text datasets and run models.
Let's web-scrape audience reviews of The Lion King from Rotten Tomatoes.
#Loading required Libraries
import requests
import time
import csv
import pandas as pd
import numpy as np
#Creating headers for our request
headers = {
'Referer': 'https://www.rottentomatoes.com/m/the_lion_king_2019/reviews?type=user',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest',
}
#API link to webscrape from RottenTomato
url = 'https://www.rottentomatoes.com/napi/movie/9057c2cf-7cab-317f-876f-e50b245ca76e/reviews/user'
#Initial payload parameters to fetch data
payload = {
'direction': 'next',
'endCursor': '',
'startCursor': '',
}
#Creating a Session Object with Rotten Tomato API
sess = requests.Session()
# To fetch one-page reviews by using GET
r = sess.get(url, headers=headers, params=payload) # GET Call
data = r.json()
#Creating Empty list for Page Info and Audience review to initiate Iteration
page_info = []
Audience_reveiws = []
# To get 6000 reviews, calling GET for 600 times.
for i in range(600):
update_start = data.get('pageInfo').get('startCursor')
update_end = data.get('pageInfo').get('endCursor')
payload.update({'startCursor':update_start})
payload.update({'endCursor':update_end})
r = sess.get(url, headers=headers, params=payload) # GET Call
data = r.json()
page_info.append(data.get('pageInfo'))
Audience_reveiws.append(data.get('reviews'))
time.sleep(5)
#To view received data
print(Audience_reveiws)
# Build a DataFrame with one row per scraped review.
# DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0, and
# appending row-by-row is quadratic, so collect plain dicts and construct
# the frame once. Iterating each page's own review list also fixes the
# original inner loop, which used len(Audience_reveiws[0]) for every page
# and would fail on a shorter final page.
col_names = ['ReviewID', 'Reviewer Name', 'Review', 'Rating', 'Date_of_Review']
rows = []
for page in Audience_reveiws:           # one entry per fetched page
    for rev in page:                    # up to 10 reviews per page
        rows.append({
            'ReviewID': rev.get('user').get('userId'),
            'Reviewer Name': rev.get('displayName'),
            'Review': rev.get('review'),
            'Rating': rev.get('score'),
            'Date_of_Review': rev.get('createDate'),
        })
review_data = pd.DataFrame(rows, columns=col_names)
#Check the shape of the review data
review_data.shape
#To view data with head of few lines
review_data.head()
# Export data into csv file from the data frame
review_data.to_csv("audience_review.csv", sep=',', columns=['ReviewID','Reviewer Name', 'Review', 'Rating' ,'Date_of_Review'], header=True, index=False)
#To Create New column "Sentiment" - If Rating is greater than 3, Positive Sentiment. If it is less than or equal to 3, Negative
review_data['Sentiment'] = np.where(review_data['Rating']>3, 'Pos','Neg')
#To view data with head of few lines to verify Sentiment column
review_data.head(3)
Let us do some exploratory data analysis and data visualization.
#Loading required Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
# English stop-word set used by the text-cleaning step further below
STOPWORDS = set(stopwords.words('english'))
from bs4 import BeautifulSoup
import plotly.graph_objs as go
from sklearn.model_selection import train_test_split
from IPython.core.interactiveshell import InteractiveShell
import plotly.figure_factory as ff
# Show every expression result in a notebook cell, not only the last one
InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
#Reading data (the CSV exported by the scraping step above)
data = pd.read_csv("audience_review.csv")
#To view data with head of few lines
data.head()
#To Create New column "Sentiment" - If Rating is greater than 3, Positive Sentiment. If it is less than or equal to 3, Negative
data['Sentiment'] = np.where(data['Rating']>3, 'Pos', 'Neg')
#To view data with head of few lines to verify Sentiment column
data.head()
data.shape
#First data collection - Imbalanced data: pie chart of the Pos/Neg split
data.Sentiment.value_counts().plot(kind='pie', autopct='%1.0f%%')
The extracted data was imbalanced, with far fewer negative reviews. So the data was extracted again, 6000 records in total, from which a balanced set of positive and negative reviews will be selected.
# data is already a DataFrame; this call is a harmless no-op kept from the notebook
data= pd.DataFrame(data)
# Balance the classes by taking the first 1500 reviews of each sentiment
# NOTE(review): "Postivie" is a typo, kept because later cells use the name
Postivie_review= data[data['Sentiment'] == 'Pos'].head(1500)
Negative_review= data[data['Sentiment'] == 'Neg'].head(1500)
Postivie_review.head(3)
Negative_review.head(3)
#Combine both the positive and negative reviews data (3000 rows: Pos rows first, then Neg)
combine_data=pd.concat([Postivie_review, Negative_review])
#To check if combined data has Sentiment levels properly (positive rows at the top)
combine_data.head(3)
#To check if combined data has Sentiment levels properly (negative rows at the bottom)
combine_data.tail(3)
#Renaming to convenient name
data1=combine_data
#To check shape of the data
data1.shape
# Enlarge the default matplotlib figure size to 12 x 10 inches
plot_size = plt.rcParams["figure.figsize"]
print(plot_size[0])
print(plot_size[1])
plot_size[0] = 12
plot_size[1] = 10
plt.rcParams["figure.figsize"] = plot_size
# Sentiment distribution plot
Sentiment = data1.Sentiment.value_counts().plot(kind='pie', autopct='%1.0f%%', shadow=True)
#Rating distribution plot
data1.Rating.value_counts().plot(kind='pie', autopct='%1.0f%%', shadow=True)
#To check the count of Rating in different level/value
data1.Rating.value_counts()
#To check the count of Sentiment in different level/value
data1.Sentiment.value_counts()
# Print the review text and sentiment stored at a given dataframe index.
def print_plot(index):
    selected = data[data.index == index][['Review', 'Sentiment']].values[0]
    if len(selected) == 0:
        return
    print(selected[0])
    print('Sentiment:', selected[1])

# Eyeball one sample review.
print_plot(100)
#Converting Date to appropriate data type (string -> pandas datetime64)
data1.Date_of_Review= pd.to_datetime(data1.Date_of_Review)
#To check Date column changed data type
data1.head()
Let us bring in some new features that describe the data more clearly.
#To create new feature - Weekday (name of the day the review was posted)
# Series.dt.weekday_name was removed in pandas 1.0; dt.day_name() returns
# the same strings ('Monday', ...) and works on both old and new pandas.
data1['Weekday']=data1['Date_of_Review'].dt.day_name()
#To view Weekday column
data1.head(3)
#To create new feature if the review is created on weekday or weekend
data1['dow'] = data1['Date_of_Review'].apply(lambda x: x.date().weekday())
data1['is_weekend'] = data1['Date_of_Review'].apply(lambda x: 1 if x.date().weekday() in (5, 6) else 0)
#To view Weekend column
data1.head(3)
#To view count of review for each day in Week
data1.Weekday.value_counts()
#Weekday distribution plot on Weekdays
data1.Weekday.value_counts().plot(kind='pie', autopct='%1.0f%%')
#To view count of review if it is on weekend or not
data1.is_weekend.value_counts()
#Plot to view count of review on weekend
data1.is_weekend.value_counts().plot(kind='pie', autopct='%1.0f%%')
#To view share of Positive reviews on weekend vs weekday
data1[data1.Sentiment=='Pos'].is_weekend.value_counts(normalize=True)
#Plot to view count of Positive review on weekend
data1[data1.Sentiment=='Pos'].is_weekend.value_counts().plot(kind='pie', autopct='%1.0f%%')
#To view share of Negative reviews on weekend vs weekday
data1[data1.Sentiment=='Neg'].is_weekend.value_counts(normalize=True)
#Plot to view count of Negative review on weekend
data1[data1.Sentiment=='Neg'].is_weekend.value_counts().plot(kind='pie', autopct='%1.0f%%')
#To view share of Negative reviews on each day of the week
# (original comments here said "Positive" but the filter is Sentiment=='Neg')
data1[data1.Sentiment=='Neg'].Weekday.value_counts(normalize=True)
#Plot to view count of Negative review on each day of the week
data1[data1.Sentiment=='Neg'].Weekday.value_counts().plot(kind='pie', autopct='%1.0f%%')
#To view data and understand it.
data1.head(3)
#Dropping few of the columns which are unnecessary for analysis
data2= data1.drop(['Reviewer Name', 'Rating', 'Date_of_Review'], axis=1)
#To check the data after dropping few columns
data2.head(3)
#To check shape of the data
data2.shape
#To take backup of the modified data to use it in future.
data2.to_csv("data2.csv", sep=',', columns=['ReviewID', 'Review','Sentiment'], header=True, index=False)
#Exporting review text to Text file - just as a backup file
data2['Review'].to_csv("review.txt", sep=',', columns=['Review',], header=True, index=False)
Audience review has mix of special characters, numbers and bad symbols.
It needs to be cleaned before we can use review text for classification models.
#To read file from backup.
data2 = pd.read_csv("data2.csv")

#Text cleaning code.
# Raw strings pass the backslash escapes in the character classes to the
# regex engine verbatim (non-raw '\[' etc. trigger invalid-escape warnings
# on modern Python); the patterns themselves are unchanged.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')  #special symbols replaced with a space
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')  #remaining bad symbols dropped entirely
STOPWORDS = set(stopwords.words('english'))  #English stop words to remove

def clean_text(text):
    """
    Normalise a raw review string.

    text: a string
    return: lower-cased string with REPLACE_BY_SPACE_RE symbols replaced
        by spaces, BAD_SYMBOLS_RE characters removed, and English stop
        words dropped.
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace matched symbols by a space
    text = BAD_SYMBOLS_RE.sub('', text) # delete any other disallowed characters
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwords
    return text

data2['Review'] = data2['Review'].apply(clean_text)
#To take back up of clean review text - just to view
data2['Review'].to_csv("cleanreview.txt", sep=',', columns=['Review',], header=True, index=False)
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 2000
# Max number of words in each review sequence (shorter ones are padded).
MAX_SEQUENCE_LENGTH = 500
# This is fixed. (dimensionality of the learned embedding vectors)
EMBEDDING_DIM = 100
# Keep only the 2000 most frequent words; strip punctuation and lowercase.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(data2['Review'])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
#Tokenizing text data and Padding sequence (pad/truncate every review to 500 ids)
X = tokenizer.texts_to_sequences(data2['Review'])
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)
# One-hot encode the labels; get_dummies orders columns alphabetically ('Neg', 'Pos')
Y = pd.get_dummies(data2['Sentiment'])
print('Shape of label tensor:', Y.shape)
#Train-Test split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 432)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)
#Building Model
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(LSTM(100, dropout=0.4, recurrent_dropout=0.4))
model.add(Dense(2, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
#Fitting model
%time
epochs = 3
batch_size = 64
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1)
#Model Evaluation
accr = model.evaluate(X_test,Y_test)
print('Test set\n Loss: {:0.3f}\n Accuracy: {:0.3f}'.format(accr[0],accr[1]))
# Training vs validation loss per epoch
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show();

# Keras renamed the accuracy history key from 'acc' to 'accuracy' in
# TF 2.x; pick whichever key this installation produced so the plot
# works on both old and new versions.
acc_key = 'accuracy' if 'accuracy' in history.history else 'acc'
plt.title('Accuracy')
plt.plot(history.history[acc_key], label='train')
plt.plot(history.history['val_' + acc_key], label='test')
plt.legend()
plt.show();
# pd.get_dummies orders the one-hot label columns alphabetically, so
# prediction column 0 is 'Neg' and column 1 is 'Pos'. The original
# labels = ['1','0'] printed '0' for a positive prediction; labelling
# with the class names removes the ambiguity.
labels = ['Neg', 'Pos']

# Sanity-check the model on a clearly positive unseen review...
new_Review = ['Awesome movie I love how close it was to the original film absolutely amazing ❤❤❤']
seq = tokenizer.texts_to_sequences(new_Review)
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
print(pred, labels[np.argmax(pred)])

# ...and on a clearly negative one.
new_Review = ['Its the same move years ago. This time boring. Even my grandsons didnt like it.']
seq = tokenizer.texts_to_sequences(new_Review)
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
print(pred, labels[np.argmax(pred)])
The LSTM model predicts both unseen examples correctly.
We will now predict on the unseen test data to check its performance.
# Loading Unseen Test data
Unseen_test = pd.read_csv("test-1566619745327.csv")
Unseen_test.head(3)

# Same cleaning pipeline as used for the training reviews (raw strings
# keep the regex escapes literal).
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')  #special symbols replaced with a space
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')  #remaining bad symbols dropped
STOPWORDS = set(stopwords.words('english'))  #English stop words to remove

def clean_text(text):
    """
    Normalise a raw review string (lowercase, strip symbols, drop
    English stop words) — mirrors the training-time cleaning.
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace matched symbols by a space
    text = BAD_SYMBOLS_RE.sub('', text) # delete any other disallowed characters
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwords
    return text

Unseen_test['review'] = Unseen_test['review'].apply(clean_text)

#Tokenizing text data and Padding sequence (reuse the fitted tokenizer;
# the original tokenized the same column twice into X and seq).
padded = pad_sequences(tokenizer.texts_to_sequences(Unseen_test['review']),
                       maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', padded.shape)

pred_unseen = model.predict(padded)
print(pred_unseen)

# One label per row: argmax of each probability row.
# The original first took np.argmax over the WHOLE matrix (a single
# flattened index, not a class), then looped over a hard-coded
# range(0, 1199), silently dropping the final row; iterating the rows
# fixes both.
labels = ['0','1']
lstm_output = [labels[np.argmax(row)] for row in pred_unseen]
lstm_OUT = pd.DataFrame(lstm_output)
#Exporting LSTM model output to CSV file.
lstm_OUT.to_csv("lstm_output.csv")
The model is overfitted.
Let's draw some insights from the audience review text.
#To create word count feature with taking length of text split words
data2['word_count'] = [len(text.split(' ')) for text in data2['Review']]
data2.head(3)

## Quartiles (and the 90th percentile) of the review word counts.
## The original printed "first quartile" for all four values.
q1 = np.percentile(data2.word_count,25)
print(f"The first quartile value of the word_count attribute is {q1}")
q2 = np.percentile(data2.word_count,50)
print(f"The second quartile (median) value of the word_count attribute is {q2}")
q3 = np.percentile(data2.word_count,75)
print(f"The third quartile value of the word_count attribute is {q3}")
q90 = np.percentile(data2.word_count,90)
print(f"The 90th percentile value of the word_count attribute is {q90}")

# Plot the computed percentiles instead of the hard-coded values
# ([5, 9, 17, 34]) the original chart used, so the figure always
# matches the data actually loaded.
labels = ['q1', 'q2', 'q3', 'q90']
sizes = [q1, q2, q3, q90]
patches= plt.bar(x=labels, height=sizes, width=0.2)
plt.legend(patches, labels, loc="best")
plt.tight_layout()
plt.show()
# Loading Spacy library and its small English pipeline
import spacy
nlp = spacy.load("en_core_web_sm")
## load spacy's English stopwords as variable called 'stopwords'
# NOTE(review): this shadows the nltk.corpus.stopwords module imported earlier
stopwords = spacy.lang.en.stop_words.STOP_WORDS
print('Number of stop words: %d' % len(stopwords))
print('First ten stop words: %s' % list(stopwords)[:10])
## load nltk's SnowballStemmer as variable 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
# Tokenizer + stemmer: returns the Snowball stems of a spaCy doc's tokens,
# excluding stop words by default.
def tokenize_and_stem(doc, remove_stopwords = True):
    """Return Snowball stems for every letter-bearing token in *doc*.

    Stop words (per token.is_stop) are dropped unless remove_stopwords
    is False; tokens containing no ASCII letter (numbers, punctuation)
    are always dropped.
    """
    if remove_stopwords:
        raw_tokens = [tok.text for tok in doc if not tok.is_stop]
    else:
        raw_tokens = [tok.text for tok in doc]
    # keep only tokens that contain at least one letter
    lettered = [tok for tok in raw_tokens if re.search('[a-zA-Z]', tok)]
    return [stemmer.stem(tok) for tok in lettered]
def tokenize_and_lemmatize(doc, remove_stopwords = True):
    """Return the lemma of every letter-bearing token in *doc*.

    spaCy lemmas are lower-cased, tense-normalised forms (older spaCy
    models lemmatize pronouns to '-PRON-'). Stop words are dropped
    unless remove_stopwords is False.
    """
    if remove_stopwords:
        candidates = [tok for tok in doc if not tok.is_stop]
    else:
        candidates = list(doc)
    lemmas = []
    for tok in candidates:
        # skip purely numeric / punctuation tokens
        if re.search('[a-zA-Z]', tok.text):
            lemmas.append(tok.lemma_)
    return lemmas
def tokenize_only(doc, remove_stopwords = True):
    """Return the raw text of every letter-bearing token in *doc*,
    dropping stop words by default (no stemming or lemmatization)."""
    if remove_stopwords:
        texts = [w.text for w in doc if not w.is_stop]
    else:
        texts = [w.text for w in doc]
    # discard tokens with no letters (numbers, raw punctuation)
    return [t for t in texts if re.search('[a-zA-Z]', t)]
#Converting the DataFrame into a dictionary of column -> list
# NOTE: from this point on data2 is a plain dict, no longer a DataFrame.
data2 = data2.reset_index().to_dict(orient='list')
## We are trying to create four seperate lists for text with stop words, text without stop words,
## text with stemmed words and text with lemmatized words.
## Naming Conventions followed ####
## 'clean' word is appended to lists which do not contain stopwords
## 'all' keyword is appended to lists which contain stopwords.
## use extend so it's a big flat list of vocab
data2['clean_text_stemmed'] = []
data2['clean_text_lemmatized'] = []
data2['text_stemmed'] = []
data2['text_lemmatized'] = []
# Flat vocabularies accumulated across all reviews. vocab_tokenized is
# built in the same order as vocab_lemmatized (and allvocab_* likewise),
# so they can be zipped as parallel lists below.
vocab_stemmed = []
vocab_tokenized = []
allvocab_tokenized = []
vocab_lemmatized = []
allvocab_lemmatized = []
for idx,text in enumerate(data2['Review']):
    ## first convert the entire text into spacy document type
    doc = nlp(text)
    print(f"processing {idx} document")
    # with stop words removed
    words_stemmed = tokenize_and_stem(doc)
    words_lemmatized = tokenize_and_lemmatize(doc)
    vocab_stemmed.extend(words_stemmed)
    vocab_lemmatized.extend(words_lemmatized)
    data2['clean_text_stemmed'].append(words_stemmed)
    data2['clean_text_lemmatized'].append(words_lemmatized)
    # with stop words kept
    allwords_stemmed = tokenize_and_stem(doc, False)
    allwords_lemmatized = tokenize_and_lemmatize(doc, False)
    allvocab_lemmatized.extend(allwords_lemmatized)
    data2['text_stemmed'].append(allwords_stemmed)
    data2['text_lemmatized'].append(allwords_lemmatized)
    # raw token texts
    allwords_tokenized = tokenize_only(doc,False)
    allvocab_tokenized.extend(allwords_tokenized)
    words_tokenized = tokenize_only(doc)
    vocab_tokenized.extend(words_tokenized)
#Creating Vocab frame list of words
# DataFrames mapping lemma (the index) -> original token text, with and
# without stop words.
all_vocab_frame = pd.DataFrame({'words': allvocab_tokenized}, index = allvocab_lemmatized)
print ('there are ' + str(all_vocab_frame.shape[0]) + ' items in all_vocab_frame')
vocab_frame = pd.DataFrame({'words': vocab_tokenized}, index = vocab_lemmatized)
print ('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')
print (vocab_frame.head(20))
#To take unique vocab words (values) with their frequencies (counts)
values, counts = np.unique(vocab_frame, return_counts=True)
all_values, all_counts = np.unique(all_vocab_frame, return_counts=True)
#To sort the vocab words by descending frequency
sorted_indices = np.argsort(-counts)
print(sorted_indices)
all_sorted_indices = np.argsort(-all_counts)
print(all_sorted_indices)
# To view the vocab words with counts
values = values[sorted_indices]
counts = counts[sorted_indices]
all_values = all_values[all_sorted_indices]
all_counts = all_counts[all_sorted_indices]
#Plot of the 75 most frequent (stop-word-free) words
font = {'weight' : 'bold',
        'size' : 50}
plt.rc('font', **font)
fig = plt.figure(figsize=(70,70))
plt.barh(values[:75], counts[:75])
plt.gca().invert_yaxis()
plt.show()
#Import Stop words library
from spacy.lang.en.stop_words import STOP_WORDS
# Add word Movie to stop words - default list
stopwords1 = ['movie'] + list(STOP_WORDS)
print(stopwords1[:10],"\n\n")

#Plot of the 50 most frequent words
font = {'weight' : 'bold',
        'size' : 50}
plt.rc('font', **font)
fig = plt.figure(figsize=(70,70))
plt.barh(values[:50], counts[:50])
plt.gca().invert_yaxis()
# matplotlib has no plt.line(); axvline draws the intended vertical
# reference line at count = 200.
plt.axvline(x=200)
plt.show()
from wordcloud import WordCloud
#Plot to get wordcloud of a single (the 500th) cleaned review
wordcloud = WordCloud().generate(data2['Review'][500])
import matplotlib.pyplot as plt
%matplotlib inline
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
Animation and amazing are visible from word cloud.
from IPython.display import display, Image

# Raw strings keep the Windows-path backslashes literal; display() renders
# each image explicitly.
display(Image(r"D:\Data_science\PHD\keywords.png"))
display(Image(r"D:\Data_science\PHD\simbatshirt.png"))
display(Image(r"D:\Data_science\PHD\hakunamatata.png"))
We have two ways to vectorize the words into matrix form
Exploring TFIDF vectorizer further with 2 more options with text data,
## tfidf vectorizer needs sentence and not token. Hence we need to combine all the tokens back to form a string
data2['clean_text_stemmed'] = [' '.join(text) for text in data2['clean_text_stemmed']]
data2['clean_text_lemmatized'] = [' '.join(text) for text in data2['clean_text_lemmatized']]
data2['text_lemmatized'] = [' '.join(text) for text in data2['text_lemmatized']]
#To create new variables for Lemmatized clean text and Lemmatized non stop word removal text
cleantext_lemma = data2['clean_text_lemmatized']
text_lemma_nsw = data2['text_lemmatized']
from sklearn.feature_extraction.text import TfidfVectorizer
#define vectorizer parameters for cleantext_lemma
# unigrams to trigrams, terms present in >=0.1% and <=95% of documents,
# capped at 7000 features
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, max_features=7000,
                                   min_df=0.001,
                                   use_idf=True, ngram_range=(1,3))
tfidf_matrix_clntxt = tfidf_vectorizer.fit_transform(cleantext_lemma)
print(tfidf_matrix_clntxt.shape)
#Converting sparse data to dense form
TV_Mat_clntxt = tfidf_matrix_clntxt.todense()
TV_Mat_clntxt
#For TFid vector - converting to dataframe
TV_Mat_clntxt = pd.DataFrame(TV_Mat_clntxt)
TV_Mat_clntxt.head()
#Define the Target Variable
# NOTE(review): rows 0-1499 are the positive reviews and rows 1500-2999
# the negative ones (per the earlier concat), so '0' = Pos and '1' = Neg
# here — confirm this inverted coding is intentional.
TV_Mat_clntxt['Sentiment'] = ['0']*1500+['1']*1500
#define vectorizer parameters for text_lemma_nsw (same settings)
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, max_features=7000,
                                   min_df=0.001,
                                   use_idf=True, ngram_range=(1,3))
tfidf_matrix_nsw = tfidf_vectorizer.fit_transform(text_lemma_nsw)
print(tfidf_matrix_nsw.shape)
#Converting sparse data to dense form
TV_Mat_nsw = tfidf_matrix_nsw.todense()
TV_Mat_nsw
#For TFid vector - with non stop word removal text
TV_Mat_nsw = pd.DataFrame(TV_Mat_nsw)
TV_Mat_nsw.head()
#Define the Target Variable (same coding as above)
TV_Mat_nsw['Sentiment'] = ['0']*1500+['1']*1500
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
#define count vectorizer parameters for cleantext_lemma
cv=CountVectorizer(stop_words='english',lowercase=True,
                   strip_accents='unicode',decode_error='ignore')
# term-document matrix: rows = reviews, columns = vocabulary terms
tdm = cv.fit_transform(data2['clean_text_lemmatized'])
tdm
#Converting sparse data to dense form
Matrix = tdm.todense()
Matrix
#For Counter vectorizer
Mat = pd.DataFrame(Matrix)
Mat.head()
#Define the Target Variable ('0' = first 1500 rows, '1' = last 1500 rows)
Mat['Sentiment'] = ['0']*1500+['1']*1500
Mat.head()
#Loading required Libraries for Clustering
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import joblib
# Elbow method: fit KMeans for k = 1..9 on the TF-IDF matrix and record
# the inertia (within-cluster sum of squared distances) for each k.
Sum_of_squared_distances = []
K = range(1,10)
for k in K:
    kmeanModel = KMeans(n_clusters=k, random_state=143)
    kmeanModel.fit(tfidf_matrix_nsw)
    Sum_of_squared_distances.append(kmeanModel.inertia_)
## Plot the elbow
font = {'weight' : 'bold',
        'size' : 10}
plt.rc('font', **font)
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
We shall ideally build clustering with K=4.
with interest to get insight on binary class clustering, we will build cluster with K=2
#Building Binary class cluster to represent Positive and Negative Sentiment.
num_clusters = 2
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix_nsw)
#km.labels_
# One cluster id (0/1) per review, in row order
clusters = km.labels_.tolist()
#km.cluster_centers
centers = km.cluster_centers_
print(f"the cluster centers are {centers}")
# Persist the fitted model for later reuse
joblib.dump(km, 'doc_cluster_best_K.pkl')
#To view cluster center/Mean values
print(km.cluster_centers_)
print(km.cluster_centers_.shape)
#To sort cluster center in order.
km.cluster_centers_.argsort()
## Reversing the list so that index of max element is in 0th index
km.cluster_centers_.argsort()[:,::-1]
#To create cluster group feature in our data
data2['cluster_group'] = clusters
# NOTE(review): data2 is a plain dict here and no 'clean_text' key was
# ever created, so this pop is a no-op (the None default suppresses the
# KeyError) — confirm which key was meant.
data2.pop('clean_text', None)
pd.DataFrame(data2).head(5)
#To create new Dataframe to proceed with few more steps
cluster_df = pd.DataFrame(data2)
#To view count of levels in each cluster
cluster_df['cluster_group'].value_counts()
##Step 1: split each lemmatized review back into tokens
cluster_df['tokenized_text'] = [text.split(' ') for text in cluster_df['text_lemmatized']]
##Step 2: group the token lists by cluster id
grouped_text = cluster_df.groupby('cluster_group')['tokenized_text']
## Fetch entire tokenized text for specific group
grouped_text.get_group(0)

from itertools import chain

# Per-cluster word frequencies, sorted most-frequent first.
# Built as a list of dicts and one DataFrame() call: DataFrame.append()
# was deprecated in pandas 1.4 and removed in 2.0, and the original's
# set-literal columns={...} gave a nondeterministic column order.
rows = []
for num in range(num_clusters):
    values, counts = np.unique(list(chain.from_iterable(grouped_text.get_group(num))),
                               return_counts=True)
    sorted_indices = np.argsort(-counts)
    rows.append({"values": values[sorted_indices],
                 "counts": counts[sorted_indices],
                 "cluster_id": num})
frequent_words_df = pd.DataFrame(rows, columns=["values", "counts", "cluster_id"])

#To view head of frequent value dataframe
frequent_words_df.head()

#Plot the 30 most frequent words of each cluster side by side.
font = {'weight' : 'bold',
        'size' : 70}
plt.rc('font', **font)
fig = plt.figure(figsize=(100,100))
plt.subplot(2,2,1)
plt.barh(frequent_words_df.loc[0,'values'][:30], frequent_words_df.loc[0,'counts'][:30])
plt.gca().invert_yaxis()
plt.subplot(2,2,2)
plt.barh(frequent_words_df.loc[1,'values'][:30], frequent_words_df.loc[1,'counts'][:30])
plt.gca().invert_yaxis()
from IPython.display import display, Image

# Raw strings keep the Windows-path backslashes literal; display() renders
# both images explicitly.
display(Image(r"D:\Data_science\PHD\wordcamparision.png"))
display(Image(r"D:\Data_science\PHD\ClusteringComparisionReport.png"))
40% of the Audience compared Lion King 2019 movie with Original movie 1994.
62% of Audience gave positive sentiment on Movie while 38% of the audience gave negative sentiment
12% of the audience talked about story of the movie.
69% of audience are happy with story line while 31% of audience are not so happy.
13% of audience commented about remake version.
61% of audience felt excited about remake version while 39% of audience expectation was not met.
11% of audience expressed about Voice over in the movie.
53% of the audience found it pleasant, while 47% described it as disgusting.
Especially female voice over character should have been better.
10% of audience wrote comment on Disney.
60% approve of the Disney's make while 40% disapprove of it.
90% of audience who wrote comment expressed WOW feel for animation.
However 10% felt it could have been better.
Overall, Thumbsup for Animation !!
8% of audience expressed about CGI - Animated characters.
70% of audience are exclaimed about how it was made.
30% of audience expected level did not meet.
10% of the audience explained about emotional mix in the movie.
70% of the audience felt good, as the movie portrays pride, bravery, the circle of life, etc.
30% of audience felt scary for little kids, rude in few of the scenes.
28% of audience expressed strong negative sentiments on movie.
5% of audience felt few of the scenes are scary, brave, emotional.
# Train/test split for the TF-IDF matrix built from lemmatised clean text.
# Shuffle the whole frame reproducibly, then hold out everything after the
# first 2400 rows as the test set.
n_train = 2400
TV_Mat_clntxt = TV_Mat_clntxt.sample(frac=1, random_state=1234)
train_tv_clntext, test_tv_clntext = (
    TV_Mat_clntxt.iloc[:n_train],
    TV_Mat_clntxt.iloc[n_train:],
)
# Every column but the last is a feature; the last column is the target.
X_train_TV_clntxt, Y_train_TV_clntxt = (
    train_tv_clntext.iloc[:, :-1],
    train_tv_clntext.iloc[:, -1],
)
X_test_TV_clntxt, Y_test_TV_clntxt = (
    test_tv_clntext.iloc[:, :-1],
    test_tv_clntext.iloc[:, -1],
)
# Train/test split for the TF-IDF matrix built WITHOUT stop-word removal.
# Same 2400-row cut as the clean-text split, but a different shuffle seed.
n_train = 2400
TV_Mat_nsw = TV_Mat_nsw.sample(frac=1, random_state=4321)
train_tv_nsw, test_tv_nsw = (
    TV_Mat_nsw.iloc[:n_train],
    TV_Mat_nsw.iloc[n_train:],
)
# Features = all columns except the last; target = last column.
X_train_TV_nsw, Y_train_TV_nsw = (
    train_tv_nsw.iloc[:, :-1],
    train_tv_nsw.iloc[:, -1],
)
X_test_TV_nsw, Y_test_TV_nsw = (
    test_tv_nsw.iloc[:, :-1],
    test_tv_nsw.iloc[:, -1],
)
# Train/test split for the CountVectorizer document-term matrix.
n_train = 2400
Mat = Mat.sample(frac=1, random_state=1234)  # reproducible shuffle
train_cv, test_cv = Mat.iloc[:n_train], Mat.iloc[n_train:]
# Features = all columns except the last; target = last column.
X_train_CV, Y_train_CV = train_cv.iloc[:, :-1], train_cv.iloc[:, -1]
X_test_CV, Y_test_CV = test_cv.iloc[:, :-1], test_cv.iloc[:, -1]
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

# Baseline: logistic regression on the CountVectorizer features.
logreg = LogisticRegression()
logreg.fit(X_train_CV, Y_train_CV)
# Predictions on train and test data
lr_pred_train = logreg.predict(X_train_CV)
lr_pred = logreg.predict(X_test_CV)
# Test data confusion Matrix
confusion_matrix_lr = confusion_matrix(Y_test_CV, lr_pred)
### Train data accuracy
print("\n\n--------------------------------------\n\n")
print("\nTrain DATA ACCURACY", accuracy_score(Y_train_CV, lr_pred_train))
# NOTE: pos_label is ignored (with a warning) when average='weighted', so the
# original pos_label='1' argument is dropped and the label text corrected.
print("\nTrain data f1-score (weighted)", f1_score(Y_train_CV, lr_pred_train, average='weighted'))
### Test data accuracy
print("\n\n--------------------------------------\n\n")
print("TEST Conf Matrix : \n", confusion_matrix_lr)
print("\nTEST DATA ACCURACY", accuracy_score(Y_test_CV, lr_pred))
print("\nTest data f1-score (weighted)", f1_score(Y_test_CV, lr_pred, average='weighted'))
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes on the CountVectorizer features.
NB = MultinomialNB()
NB.fit(X_train_CV, Y_train_CV)
# Predictions on train data — kept in its own variable; the original stored
# train and test predictions in the same name (the second assignment
# clobbered the first) and then printed the *logistic-regression* train
# metrics instead of NB's.
NB_pred_train = NB.predict(X_train_CV)
# Predictions on test data
NB_pred = NB.predict(X_test_CV)
# Test data confusion Matrix
confusion_matrix_NB = confusion_matrix(Y_test_CV, NB_pred)
### Train data accuracy
print("\n\n--------------------------------------\n\n")
print("\nTrain DATA ACCURACY", accuracy_score(Y_train_CV, NB_pred_train))
print("\nTrain data f1-score (weighted)", f1_score(Y_train_CV, NB_pred_train, average='weighted'))
### Test data accuracy
print("\n\n--------------------------------------\n\n")
print("TEST Conf Matrix : \n", confusion_matrix_NB)
print("\nTEST DATA ACCURACY", accuracy_score(Y_test_CV, NB_pred))
print("\nTest data f1-score (weighted)", f1_score(Y_test_CV, NB_pred, average='weighted'))
## Build a SVM Classifier (default RBF kernel) on the CountVectorizer features.
from sklearn.svm import SVC

## Create an SVC object; the bare expression shows the default arguments.
svc = SVC()
svc
## Fit
svc_cv = svc.fit(X_train_CV, Y_train_CV)
# Predictions on train data
svm_pred_cv_train = svc_cv.predict(X_train_CV)
# Predictions on test data
svm_pred_cv = svc_cv.predict(X_test_CV)
confusion_matrix_test_svm_cv = confusion_matrix(Y_test_CV, svm_pred_cv)
### Train data accuracy
print("\n\n--------------------------------------\n\n")
# Fixed: the original "\Train" printed a literal backslash ("\T" is not an
# escape sequence); "\n" was intended.
print("\nTrain DATA ACCURACY", accuracy_score(Y_train_CV, svm_pred_cv_train))
print("\nTrain data f1-score (weighted)", f1_score(Y_train_CV, svm_pred_cv_train, average='weighted'))
### Test data accuracy
print("\n\n--------------------------------------\n\n")
print("TEST Conf Matrix : \n", confusion_matrix_test_svm_cv)
print("\nTEST DATA ACCURACY", accuracy_score(Y_test_CV, svm_pred_cv))
print("\nTest data f1-score (weighted)", f1_score(Y_test_CV, svm_pred_cv, average='weighted'))
The SVM model gives the worst accuracy.
from sklearn.tree import DecisionTreeClassifier
Dtc = DecisionTreeClassifier()
# Build Model
%time Dtc.fit(X_train_CV, Y_train_CV)
# Predictions on test data
DTC_pred_CV_train=Dtc.predict(X_train_CV)
DTC_pred_CV=Dtc.predict(X_test_CV)
confusion_matrix_test_dtc_cv= confusion_matrix(Y_test_CV,DTC_pred_CV)
### Train data accuracy
print("\n\n--------------------------------------\n\n")
print("\Train DATA ACCURACY",accuracy_score(Y_train_CV, DTC_pred_CV_train))
print("\n Train data f1-score for class '1'",f1_score(Y_train_CV, DTC_pred_CV_train, average='weighted'))
### Test data accuracy
print("\n\n--------------------------------------\n\n")
print("TEST Conf Matrix : \n", confusion_matrix_test_dtc_cv)
print("\nTEST DATA ACCURACY",accuracy_score(Y_test_CV,DTC_pred_CV))
print("\nTest data f1-score for class '1'",f1_score(Y_test_CV,DTC_pred_CV, average='weighted'))
from sklearn.ensemble import RandomForestClassifier
Rf = RandomForestClassifier()
# Build Model
%time Rf.fit(X_train_CV, Y_train_CV)
# Predictions on test data
RF_pred_CV_train=Dtc.predict(X_train_CV)
RF_pred_CV=Dtc.predict(X_test_CV)
confusion_matrix_test_rf_cv= confusion_matrix(Y_test_CV,DTC_pred_CV)
### Train data accuracy
print("\n\n--------------------------------------\n\n")
print("\Train DATA ACCURACY",accuracy_score(Y_train_CV, RF_pred_CV_train))
print("\n Train data f1-score for class '1'",f1_score(Y_train_CV, RF_pred_CV_train, average='weighted'))
### Test data accuracy
print("\n\n--------------------------------------\n\n")
print("TEST Conf Matrix : \n", confusion_matrix_test_rf_cv)
print("\Train DATA ACCURACY",accuracy_score(Y_test_CV, RF_pred_CV))
print("\n Train data f1-score for class '1'",f1_score(Y_test_CV, RF_pred_CV, average='weighted'))
from sklearn.ensemble import GradientBoostingClassifier
Gbm = GradientBoostingClassifier()
# Build Model
%time Gbm.fit(X_train_CV, Y_train_CV)
# Predictions on test data
Gbm_pred_CV_train=Gbm.predict(X_train_CV)
Gbm_pred_CV=Gbm.predict(X_test_CV)
confusion_matrix_test_gbm_cv= confusion_matrix(Y_test_CV,Gbm_pred_CV)
### Train data accuracy
print("\n\n--------------------------------------\n\n")
print("\Train DATA ACCURACY",accuracy_score(Y_train_CV, Gbm_pred_CV_train))
print("\n Train data f1-score for class '1'",f1_score(Y_train_CV, Gbm_pred_CV_train, average='weighted'))
### Test data accuracy
print("\n\n--------------------------------------\n\n")
print("TEST Conf Matrix : \n", confusion_matrix_test_gbm_cv)
print("\Train DATA ACCURACY",accuracy_score(Y_test_CV, Gbm_pred_CV))
print("\n Train data f1-score for class '1'",f1_score(Y_test_CV, Gbm_pred_CV, average='weighted'))
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.metrics import accuracy_score,f1_score, confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import average_precision_score
from matplotlib import pyplot
from sklearn.linear_model import LogisticRegression

# Regularised logistic regression (C=0.4) on the TF-IDF features built
# without stop-word removal ("nsw").
logreg = LogisticRegression(C=0.4)
logreg.fit(X_train_TV_nsw, Y_train_TV_nsw)

# Predictions and confusion matrices for each split.
lr_pred_tv_train_nsw = logreg.predict(X_train_TV_nsw)
confusion_matrix_lr_tv_train_nsw = confusion_matrix(Y_train_TV_nsw, lr_pred_tv_train_nsw)
lr_pred_tv_test_nsw = logreg.predict(X_test_TV_nsw)
confusion_matrix_lr_tv_test_nsw = confusion_matrix(Y_test_TV_nsw, lr_pred_tv_test_nsw)

# Report train metrics first, then test metrics.
print("\n\n--------------------------------------\n\n")
print("Train Conf Matrix : \n", confusion_matrix_lr_tv_train_nsw)
print("\nTrain DATA ACCURACY", accuracy_score(Y_train_TV_nsw, lr_pred_tv_train_nsw))
print("\nTrain data f1-score for class '1'", f1_score(Y_train_TV_nsw, lr_pred_tv_train_nsw, average='weighted'))
print("\n\n--------------------------------------\n\n")
print("TEST Conf Matrix : \n", confusion_matrix_lr_tv_test_nsw)
print("\nTEST DATA ACCURACY", accuracy_score(Y_test_TV_nsw, lr_pred_tv_test_nsw))
print("\nTest data f1-score for class '1'", f1_score(Y_test_TV_nsw, lr_pred_tv_test_nsw, average='weighted'))
# predict probabilities
probs = logreg.predict_proba(X_test_TV_nsw)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# predict class values
yhat = logreg.predict(X_test_TV_nsw)
# calculate precision-recall curve (assumes the positive class label is the
# string '1' — TODO confirm the target encoding)
precision, recall, thresholds = precision_recall_curve(Y_test_TV_nsw, probs, pos_label='1')
# calculate F1 score; note precision/recall/f1 are computed here but only the
# ROC curve below is actually plotted
f1 = f1_score(Y_test_TV_nsw, yhat, average='weighted')
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
# calculate roc curve
fpr, tpr, thresholds = roc_curve(Y_test_TV_nsw, probs, pos_label='1')
# plot the no-skill diagonal (a random classifier's ROC)
pyplot.plot([0, 1], [0, 1], linestyle='--')
# plot the ROC curve for the model (the original comment said
# "precision-recall", but fpr/tpr is a ROC curve)
pyplot.plot(fpr, tpr, marker='.')
# show the plot
pyplot.show()
from sklearn.linear_model import LogisticRegression

# Regularised logistic regression (C=0.5) on the lemmatised clean-text
# TF-IDF features; this model is later reused for the unseen data and in
# the majority-vote ensemble.
logreg = LogisticRegression(C=0.5)
logreg.fit(X_train_TV_clntxt,Y_train_TV_clntxt)
#Predictions on train data
lr_pred_tv_train_clntxt =logreg.predict(X_train_TV_clntxt)
#Predictions on test data
lr_pred_tv_test_clntxt=logreg.predict(X_test_TV_clntxt)
# Train data confusion Matrix
confusion_matrix_lr_tv_train_clntxt = confusion_matrix(Y_train_TV_clntxt,lr_pred_tv_train_clntxt)
# Test data confusion Matrix
confusion_matrix_lr_tv_test_clntxt = confusion_matrix(Y_test_TV_clntxt,lr_pred_tv_test_clntxt)
### Train data accuracy
print("\n\n--------------------------------------\n\n")
print("Train Conf Matrix : \n", confusion_matrix_lr_tv_train_clntxt)
print("\nTrain DATA ACCURACY",accuracy_score(Y_train_TV_clntxt,lr_pred_tv_train_clntxt))
print("\nTrain data f1-score for class '1'",f1_score(Y_train_TV_clntxt,lr_pred_tv_train_clntxt, average='weighted'))
### Test data accuracy
print("\n\n--------------------------------------\n\n")
print("TEST Conf Matrix : \n", confusion_matrix_lr_tv_test_clntxt)
print("\nTEST DATA ACCURACY",accuracy_score(Y_test_TV_clntxt,lr_pred_tv_test_clntxt))
print("\nTest data f1-score for class '1'",f1_score(Y_test_TV_clntxt,lr_pred_tv_test_clntxt, average='weighted'))
# NOTE(review): TV_Mat_unseen is constructed in a *later* cell of this file;
# the notebook was evidently executed out of order — confirm before re-running
# top to bottom.
lr_pred_unseen = logreg.predict(TV_Mat_unseen)
from sklearn.naive_bayes import MultinomialNB

# Tuned multinomial naive Bayes (alpha=0.4 smoothing, class priors learned
# from the data) on the clean-text TF-IDF features.
NB = MultinomialNB(alpha=0.4, fit_prior=True)
NB.fit(X_train_TV_clntxt,Y_train_TV_clntxt)
# Predictions on train data
NB_pred_train=NB.predict(X_train_TV_clntxt)
confusion_matrix_NB_train= confusion_matrix(Y_train_TV_clntxt,NB_pred_train)
# Predictions on test data
NB_pred_test=NB.predict(X_test_TV_clntxt)
confusion_matrix_NB_test= confusion_matrix(Y_test_TV_clntxt,NB_pred_test)
### Train data accuracy
print("\n\n--------------------------------------\n\n")
print("Train Conf Matrix : \n", confusion_matrix_NB_train)
print("\nTrain DATA ACCURACY",accuracy_score(Y_train_TV_clntxt,NB_pred_train))
print("\nTrain data f1-score for class '1'",f1_score(Y_train_TV_clntxt,NB_pred_train, average='weighted'))
### Test data accuracy
print("\n\n--------------------------------------\n\n")
print("TEST Conf Matrix : \n", confusion_matrix_NB_test)
print("\nTEST DATA ACCURACY",accuracy_score(Y_test_TV_clntxt,NB_pred_test))
print("\nTest data f1-score for class '1'",f1_score(Y_test_TV_clntxt,NB_pred_test, average='weighted'))
# Score the unseen reviews (TV_Mat_unseen is built in a later cell — the
# notebook was executed out of order).
NB_pred_unseen=NB.predict(TV_Mat_unseen)
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold

Dtc = DecisionTreeClassifier()

def dtc_params_best(X, y, nfolds):
    """Grid-search decision-tree hyper-parameters and return the best set.

    X, y   -- training features / target.
    nfolds -- number of cross-validation folds.
    Returns the dict of best-scoring parameters.
    """
    # The original defined four list variables that were never used and then
    # wrote a literal grid that disagreed with them (min_samples_split
    # [2,4,6,8,10,20] vs [2,10,20]); the literal grid is what actually ran,
    # so it is kept and the dead locals are removed. A dead bare
    # `grid_search.best_params_` expression is also dropped.
    param_grid = {
        'criterion': ['entropy', 'gini'],
        'max_depth': [6, 8, 10, 12],
        'min_samples_split': [2, 10, 20],
        'min_samples_leaf': [2, 4, 6],
    }
    grid_search = GridSearchCV(Dtc, param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_

val = dtc_params_best(X_train_TV_clntxt, Y_train_TV_clntxt, 5)
val
# Decision tree refit with the hyper-parameters reported by the grid search
# above (hard-coded from the `val` result).
Dtc_val = DecisionTreeClassifier(criterion ='gini', max_depth= 12, min_samples_leaf =2, min_samples_split= 2)
Dtc_val.fit(X_train_TV_clntxt,Y_train_TV_clntxt)
dtc_pred_train = Dtc_val.predict(X_train_TV_clntxt)
dtc_pred_test = Dtc_val.predict(X_test_TV_clntxt)
# Mean accuracy on each split (DecisionTreeClassifier.score).
print(Dtc_val.score(X_train_TV_clntxt, Y_train_TV_clntxt))
print(Dtc_val.score(X_test_TV_clntxt, Y_test_TV_clntxt))
confusion_matrix_DT_train =confusion_matrix(Y_train_TV_clntxt, dtc_pred_train)
confusion_matrix_DT_test= confusion_matrix(Y_test_TV_clntxt, dtc_pred_test)
### Train data accuracy
print("\n\n--------------------------------------\n\n")
print("Train Conf Matrix : \n", confusion_matrix_DT_train)
print("\nTrain DATA ACCURACY",accuracy_score(Y_train_TV_clntxt,dtc_pred_train))
print("\nTrain data f1-score for class '1'",f1_score(Y_train_TV_clntxt,dtc_pred_train, average='weighted'))
### Test data accuracy
print("\n\n--------------------------------------\n\n")
print("TEST Conf Matrix : \n", confusion_matrix_DT_test)
print("\nTEST DATA ACCURACY",accuracy_score(Y_test_TV_clntxt,dtc_pred_test))
print("\nTest data f1-score for class '1'",f1_score(Y_test_TV_clntxt,dtc_pred_test, average='weighted'))
# Score the unseen reviews (TV_Mat_unseen is built in a later cell).
dtc_pred_unseen = Dtc_val.predict(TV_Mat_unseen)
from sklearn.ensemble import RandomForestClassifier

# Random forest (500 trees, depth-limited, entropy criterion) on the
# clean-text TF-IDF features.
Rf = RandomForestClassifier(n_estimators=500, max_depth=12, criterion='entropy')
Rf.fit(X_train_TV_clntxt,Y_train_TV_clntxt)
rf_pred_train = Rf.predict(X_train_TV_clntxt)
rf_pred_test = Rf.predict(X_test_TV_clntxt)
confusion_matrix_RF_train =confusion_matrix(Y_train_TV_clntxt, rf_pred_train)
confusion_matrix_RF_test= confusion_matrix(Y_test_TV_clntxt, rf_pred_test)
### Train data accuracy
print("\n\n--------------------------------------\n\n")
print("Train Conf Matrix : \n", confusion_matrix_RF_train)
print("\nTrain DATA ACCURACY",accuracy_score(Y_train_TV_clntxt,rf_pred_train))
print("\nTrain data f1-score for class '1'",f1_score(Y_train_TV_clntxt,rf_pred_train, average='weighted'))
### Test data accuracy
print("\n\n--------------------------------------\n\n")
print("TEST Conf Matrix : \n", confusion_matrix_RF_test)
print("\nTEST DATA ACCURACY",accuracy_score(Y_test_TV_clntxt,rf_pred_test))
print("\nTest data f1-score for class '1'",f1_score(Y_test_TV_clntxt,rf_pred_test, average='weighted'))
from sklearn.ensemble import RandomForestClassifier

# Random forest on the no-stop-word-removal ("nsw") TF-IDF features.
# NOTE(review): this cell rebinds Rf / rf_pred_train / rf_pred_test,
# overwriting the clean-text random forest above. The later majority-vote
# ensemble mixes rf_pred_* (this nsw model) with clean-text predictions from
# the other models — confirm that is intentional.
Rf = RandomForestClassifier(n_estimators=500, max_depth=12, criterion='gini')
Rf.fit(X_train_TV_nsw,Y_train_TV_nsw)
rf_pred_train = Rf.predict(X_train_TV_nsw)
rf_pred_test = Rf.predict(X_test_TV_nsw)
confusion_matrix_RF_train =confusion_matrix(Y_train_TV_nsw, rf_pred_train)
confusion_matrix_RF_test= confusion_matrix(Y_test_TV_nsw, rf_pred_test)
### Train data accuracy
print("\n\n--------------------------------------\n\n")
print("Train Conf Matrix : \n", confusion_matrix_RF_train)
print("\nTrain DATA ACCURACY",accuracy_score(Y_train_TV_nsw,rf_pred_train))
print("\nTrain data f1-score for class '1'",f1_score(Y_train_TV_nsw,rf_pred_train, average='weighted'))
### Test data accuracy
print("\n\n--------------------------------------\n\n")
print("TEST Conf Matrix : \n", confusion_matrix_RF_test)
print("\nTEST DATA ACCURACY",accuracy_score(Y_test_TV_nsw,rf_pred_test))
print("\nTest data f1-score for class '1'",f1_score(Y_test_TV_nsw,rf_pred_test, average='weighted'))
# Score the unseen reviews with this (nsw) forest.
rf_pred_unseen = Rf.predict(TV_Mat_unseen)
## Build a linear SVM classifier on the clean-text TF-IDF features.
from sklearn.svm import SVC, LinearSVC

## Create a LinearSVC object; the bare expression shows the default arguments.
svc = LinearSVC()
svc
## Fit
svc_tv = svc.fit(X_train_TV_clntxt, Y_train_TV_clntxt)
# Predictions on train and test data
svm_pred_tv_train = svc_tv.predict(X_train_TV_clntxt)
svm_pred_tv_test = svc_tv.predict(X_test_TV_clntxt)
confusion_matrix_train_svm = confusion_matrix(Y_train_TV_clntxt, svm_pred_tv_train)
confusion_matrix_test_svm = confusion_matrix(Y_test_TV_clntxt, svm_pred_tv_test)
### Train data accuracy (label fixed: the original said "TEST" here)
print("\n\n--------------------------------------\n\n")
print("Train Conf Matrix : \n", confusion_matrix_train_svm)
print("\nTrain DATA ACCURACY", accuracy_score(Y_train_TV_clntxt, svm_pred_tv_train))
print("\nTrain data f1-score for class '1'", f1_score(Y_train_TV_clntxt, svm_pred_tv_train, average='weighted'))
### Test data accuracy
print("\n\n--------------------------------------\n\n")
print("TEST Conf Matrix : \n", confusion_matrix_test_svm)
# Fixed: the original referenced the undefined name `svm_pred_tv`, which
# raised NameError; the test predictions are in svm_pred_tv_test.
print("\nTEST DATA ACCURACY", accuracy_score(Y_test_TV_clntxt, svm_pred_tv_test))
print("\nTest data f1-score for class '1'", f1_score(Y_test_TV_clntxt, svm_pred_tv_test, average='weighted'))
# NOTE(review): this cell is the default-parameter repr of
# GradientBoostingClassifier (apparently pasted notebook *output*).
# Evaluating it merely constructs a throw-away estimator that is never
# assigned; it has no other effect and could be deleted.
GradientBoostingClassifier(criterion='friedman_mse', init=None,
learning_rate=0.1, loss='deviance', max_depth=3,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_iter_no_change=None, presort='auto',
random_state=None, subsample=1.0, tol=0.0001,
validation_fraction=0.1, verbose=0,
warm_start=False)
from sklearn.ensemble import GradientBoostingClassifier

# Tuned gradient boosting on the clean-text TF-IDF features; this is the
# model later applied to the unseen reviews.
GBM = GradientBoostingClassifier(learning_rate=0.12, max_depth=3, n_estimators=200, max_features=0.2, subsample=0.8)
%time GBM.fit(X_train_TV_clntxt,Y_train_TV_clntxt)
GBM_pred_train = GBM.predict(X_train_TV_clntxt)
GBM_pred_test = GBM.predict(X_test_TV_clntxt)
confusion_matrix_GBM_train =confusion_matrix(Y_train_TV_clntxt, GBM_pred_train)
confusion_matrix_GBM_test= confusion_matrix(Y_test_TV_clntxt, GBM_pred_test)
### Train data accuracy
print("\n\n--------------------------------------\n\n")
print("Train Conf Matrix : \n", confusion_matrix_GBM_train)
print("\nTrain DATA ACCURACY",accuracy_score(Y_train_TV_clntxt,GBM_pred_train))
print("\nTrain data f1-score for class '1'",f1_score(Y_train_TV_clntxt,GBM_pred_train, average='weighted'))
### Test data accuracy
print("\n\n--------------------------------------\n\n")
print("TEST Conf Matrix : \n", confusion_matrix_GBM_test)
print("\nTEST DATA ACCURACY",accuracy_score(Y_test_TV_clntxt,GBM_pred_test))
print("\nTest data f1-score for class '1'",f1_score(Y_test_TV_clntxt,GBM_pred_test, average='weighted'))
# predict probabilities (uses the clean-text logistic regression fitted above)
probs = logreg.predict_proba(X_test_TV_clntxt)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# predict class values
yhat = logreg.predict(X_test_TV_clntxt)
# calculate precision-recall curve (assumes the positive class label is the
# string '1' — TODO confirm the target encoding)
precision, recall, thresholds = precision_recall_curve(Y_test_TV_clntxt, probs, pos_label='1')
# calculate F1 score; precision/recall/f1 are computed but only the ROC curve
# below is actually plotted
f1 = f1_score(Y_test_TV_clntxt, yhat, average='weighted')
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
# calculate roc curve
fpr, tpr, thresholds = roc_curve(Y_test_TV_clntxt, probs, pos_label='1')
# plot the no-skill diagonal (a random classifier's ROC)
pyplot.plot([0, 1], [0, 1], linestyle='--')
# plot the ROC curve for the model (the original comment said
# "precision-recall", but fpr/tpr is a ROC curve)
pyplot.plot(fpr, tpr, marker='.')
# show the plot
pyplot.show()
# Converting the unseen-review DataFrame to a dictionary of lists so new
# token-list columns can be appended per review.
Unseen_test = Unseen_test.reset_index().to_dict(orient='list')
## We are trying to create four seperate lists for text with stop words, text without stop words,
## text with stemmed words and text with lemmatized words.
## Naming Conventions followed ####
## 'clean' word is appended to lists which do not contain stopwords
## 'all' keyword is appended to lists which contain stopwords.
## use extend so it's a big flat list of vocab
Unseen_test['clean_text_stemmed'] = []
Unseen_test['clean_text_lemmatized'] = []
Unseen_test['text_stemmed'] = []
Unseen_test['text_lemmatized'] = []
vocab_stemmed = []
vocab_tokenized = []
allvocab_tokenized = []
vocab_lemmatized = []
allvocab_lemmatized = []
# NOTE(review): indentation of the loop body was lost in the notebook export
# and is reconstructed here. Also: allwords_stemmed is collected per review
# but there is no allvocab_stemmed list to extend — presumably an oversight;
# confirm whether a stemmed all-words vocabulary is needed.
for idx,text in enumerate(Unseen_test['review']):
    ## first convert the entire text into spacy document type
    # print(f"The type of text is {type(text)} and text is {text}")
    # print(f"The type of idx is {type(idx)} and idx is {idx}")
    doc = nlp(text)
    print(f"processing {idx} document")
    # Tokens with stop words removed (helpers defined earlier in the notebook).
    words_stemmed = tokenize_and_stem(doc)
    words_lemmatized = tokenize_and_lemmatize(doc)
    vocab_stemmed.extend(words_stemmed)
    vocab_lemmatized.extend(words_lemmatized)
    Unseen_test['clean_text_stemmed'].append(words_stemmed)
    Unseen_test['clean_text_lemmatized'].append(words_lemmatized)
    # Tokens with stop words kept (second argument False).
    allwords_stemmed = tokenize_and_stem(doc, False)
    allwords_lemmatized = tokenize_and_lemmatize(doc, False)
    allvocab_lemmatized.extend(allwords_lemmatized)
    Unseen_test['text_stemmed'].append(allwords_stemmed)
    Unseen_test['text_lemmatized'].append(allwords_lemmatized)
    allwords_tokenized = tokenize_only(doc,False)
    allvocab_tokenized.extend(allwords_tokenized)
    words_tokenized = tokenize_only(doc)
    vocab_tokenized.extend(words_tokenized)
## tfidf vectorizer needs sentence and not token. Hence we need to combine all the tokens back to form a string
Unseen_test['clean_text_stemmed'] = [' '.join(text) for text in Unseen_test['clean_text_stemmed']]
Unseen_test['clean_text_lemmatized'] = [' '.join(text) for text in Unseen_test['clean_text_lemmatized']]
# #define vectorizer parameters
# tfidf_vectorizer = TfidfVectorizer(max_df=0.95, max_features=5000,
#                                  min_df=0.001,
#                                  use_idf=True, ngram_range=(1,5))
# Transform (NOT fit) the unseen reviews with the vectorizer fitted on the
# training corpus, so both share the same vocabulary/feature space.
tfidf_matrix_unseen = tfidf_vectorizer.transform(Unseen_test['clean_text_lemmatized'])
print(tfidf_matrix_unseen.shape)
TV_Matrix_unseen = tfidf_matrix_unseen.todense()
TV_Matrix_unseen
#For TFid vector
TV_Mat_unseen = pd.DataFrame(TV_Matrix_unseen)
TV_Mat_unseen.head()
# Score the unseen reviews with the tuned GBM.
GBM_pred_unseen = GBM.predict(TV_Mat_unseen)
GBM_pred_unseen = pd.DataFrame(GBM_pred_unseen)
GBM_pred_unseen.head(3)
# Export the GBM predictions to a CSV file (the original comment said
# "LSTM model output", but this is the GBM output).
GBM_pred_unseen.to_csv("GBM_output.csv")
# Display the hyper-parameter-tuning summary image. A raw string keeps the
# Windows-path backslashes literal (avoids invalid-escape SyntaxWarning),
# and display() renders it even outside a notebook cell tail.
from IPython.display import display, Image
display(Image(r"D:\Data_science\PHD\Hyperparametertuning.png"))
Max_features should be low - Typically 0.2-0.3
Subsampling should be high - Typically 0.6-0.8
N-estimators should be low - Typically 100-300
Max_depth should be low - Typically 3-8
Learning rate should be low - Typically 0.1 - 0.01
Max_features should be high - Typically 0.4-0.6
Subsampling should be low - Typically 0.3-0.5
N-estimators should be high - Typically 500-1000
Max_depth should be high - Typically 10-20
Learning rate should be high - Typically 0.1 - 1
!pip install xgboost
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
eval_set = [(X_train_TV_clntxt, Y_train_TV_clntxt), (X_test_TV_clntxt, Y_test_TV_clntxt)]
# fit model on training data
XG = XGBClassifier(max_depth=8,learning_rate=0.05, n_estimators=100, subsample=0.8, reg_alpha=0.6, reg_lambda=0.6, gamma=10, early_stopping_rounds=10, eval_metric="logloss", eval_set=eval_set, verbose=True)
%time XG.fit(X_train_TV_clntxt,Y_train_TV_clntxt)
XG_pred_train = XG.predict(X_train_TV_clntxt)
XG_pred_test = XG.predict(X_test_TV_clntxt)
confusion_matrix_XG_train =confusion_matrix(Y_train_TV_clntxt, XG_pred_train)
confusion_matrix_XG_test= confusion_matrix(Y_test_TV_clntxt, XG_pred_test)
### Train data accuracy
print("\n\n--------------------------------------\n\n")
confusion_matrix_XG_train
print("Train Conf Matrix : \n", confusion_matrix_XG_train)
print("\nTrain DATA ACCURACY",accuracy_score(Y_train_TV_clntxt,XG_pred_train))
print("\nTrain data f1-score for class '1'",f1_score(Y_train_TV_clntxt,XG_pred_train, average='weighted'))
### Test data accuracy
print("\n\n--------------------------------------\n\n")
print("TEST Conf Matrix : \n", confusion_matrix_XG_test)
print("\nTEST DATA ACCURACY",accuracy_score(Y_test_TV_clntxt,XG_pred_test))
print("\nTest data f1-score for class '1'",f1_score(Y_test_TV_clntxt,XG_pred_test, average='weighted'))
XG_pred_train
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D
# Fixed: Embedding and LSTM are used below but were never imported (NameError).
from keras.layers import Embedding, LSTM

# CNN + LSTM sentiment model over integer-encoded, padded review sequences
# (vocab 20000, sequence length 500).
# NOTE(review): X_train/Y_train here must be the padded sequence arrays, not
# the TF-IDF matrix — input_shape below is computed but never used; confirm.
input_shape = TV_Mat_clntxt.shape
modelcnn = Sequential()
modelcnn.add(Embedding(20000, 100, input_length=500))
modelcnn.add(Dropout(0.1))
modelcnn.add(Conv1D(128, 5, activation='relu'))
modelcnn.add(MaxPooling1D(pool_size=4))
modelcnn.add(LSTM(100))
# NOTE(review): a 2-unit softmax with binary_crossentropy expects one-hot
# (2-column) labels; with integer labels use Dense(1, 'sigmoid') or
# categorical_crossentropy — confirm the label encoding before changing.
modelcnn.add(Dense(2, activation='softmax'))
modelcnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
modelcnn.summary()
hist = modelcnn.fit(X_train, Y_train, batch_size=32, epochs=5, verbose=1, validation_data=(X_test, Y_test))
score = modelcnn.evaluate(X_test, Y_test, batch_size=32)
print('Test Loss:', score[0])
print('Test Accuracy:', score[1])
Y_pred = modelcnn.predict(X_test)
print(Y_pred)
y_pred = np.argmax(Y_pred, axis=1)
print(y_pred)
plt.title('Accuracy')
# Fixed: tf.keras >= 2 stores this metric under 'accuracy' (older keras used
# 'acc'); look up whichever key exists so the plot works on both versions.
plt.plot(hist.history.get('acc', hist.history.get('accuracy')), label='train')
plt.plot(hist.history.get('val_acc', hist.history.get('val_accuracy')), label='test')
plt.show();
from scipy.stats import mode

# Majority-vote ensemble over the five tuned classifiers' predictions.
# Fixed: the original first called StackingCVClassifier, which (a) was never
# imported, (b) was passed prediction *arrays* instead of estimators, and
# (c) had a stray trailing backtick causing a SyntaxError — those lines are
# removed; the row-wise mode below is the ensemble actually used.
stack_test = np.array([lr_pred_tv_test_clntxt, NB_pred_test, rf_pred_test, dtc_pred_test, GBM_pred_test]).T
stack_train = np.array([lr_pred_tv_train_clntxt, NB_pred_train, rf_pred_train, dtc_pred_train, GBM_pred_train]).T
# Row-wise modal class = majority vote across the five models.
stacked_pred_train = mode(stack_train, axis=1)[0]
stacked_pred_test = mode(stack_test, axis=1)[0]
### Train data accuracy (fixed: the original printed this twice, once with a
### "TEST DATA ACCURACY" label)
print("\n\n--------------------------------------\n\n")
print("Train DATA ACCURACY", accuracy_score(Y_train_TV_clntxt, stacked_pred_train))
print("\nTrain data f1-score for class '1'", f1_score(Y_train_TV_clntxt, stacked_pred_train, average='weighted'))
### Test data accuracy
print("\n\n--------------------------------------\n\n")
print("TEST DATA ACCURACY", accuracy_score(Y_test_TV_clntxt, stacked_pred_test))
# Per-class f1; labels appear to be the strings '1' and '0' (pos_label
# selects which class is scored). Fixed: the second label said "class '2'"
# while scoring pos_label='0'.
print("\nTest data f1-score for class '1'", f1_score(Y_test_TV_clntxt, stacked_pred_test, pos_label='1'))
print("\nTest data f1-score for class '0'", f1_score(Y_test_TV_clntxt, stacked_pred_test, pos_label='0'))
# Ensemble the unseen-data predictions the same way and export to CSV.
stack_unseen = np.array([lr_pred_unseen, NB_pred_unseen, rf_pred_unseen, dtc_pred_unseen, GBM_pred_unseen]).T
stack_unseen = pd.DataFrame(stack_unseen)
stack_unseen.head(3)
stacked_pred_unseen = mode(stack_unseen, axis=1)[0]
# Fixed: mode() returns an ndarray, so .head() raised AttributeError before
# the DataFrame conversion; wrap first, then preview.
stacked_pred_unseen = pd.DataFrame(stacked_pred_unseen)
stacked_pred_unseen.head()
stacked_pred_unseen.to_csv("stacking_out.csv")
# Display the closing report images. Raw strings keep the Windows-path
# backslashes literal (avoids invalid-escape SyntaxWarning on 3.12+), and
# display() renders each image even outside a notebook cell tail.
from IPython.display import display, Image
display(Image(r"D:\Data_science\PHD\modelperformance.png"))
display(Image(r"D:\Data_science\PHD\hakunamatata1.png"))
display(Image(r"D:\Data_science\PHD\Theend.png"))